The source R Markdown document is available here: Download identify_gene_promoter_regions.Rmd. # Load R packages

suppressMessages(library(tidyverse))
suppressMessages(library(AnnotationHub))
suppressMessages(library(rtracklayer))
suppressMessages(library(glue))
suppressMessages(library(biomaRt))
suppressMessages(library(RColorBrewer))
suppressMessages(library(parallel))

1 Identify protein-coding TSSs

Select TSS with the highest FANTOM CAT transcription initiation evidence score (TIEScore).

 # If the data directory doesn't already exist, create it
if(!dir.exists("data")){ dir.create(file.path("data"), showWarnings = FALSE) }

# Obtain FANTOM CAT robust annotation if not already done
if(!file.exists("data/FANTOM_CAT.lv3_robust.gtf.gz")){
  download.file(url="http://fantom.gsc.riken.jp/5/suppl/Hon_et_al_2016/data/assembly/lv3_robust/FANTOM_CAT.lv3_robust.gtf.gz", destfile = "data/FANTOM_CAT.lv3_robust.gtf.gz")
}

# Select protein-coding tx TSS that has the greatest TIEScore support
# Transcription Initiation Evidence Score (TIEScore) is a custom metric produced for FANTOM CAT to quantify the likelihood that a CAGE transcription start site is genuine. We select the transcript with the highest TIEScore.
FANTOM_TSS <- as_tibble(readGFF("data/FANTOM_CAT.lv3_robust.gtf.gz")) %>%
  group_by(gene_id) %>% 
  mutate(geneClass=unique(geneClass)[1], geneSuperClass=unique(geneSuperClass)[1], gene_name=unique(gene_name)[1]) %>% # add to all rows of gene annotation
  mutate(number_entries = dplyr::n()) %>% # Should be at least three entries for a gene (gene + transcript + exon)
  filter(number_entries>2) %>%
  arrange(desc(TIEScore)) %>%
  filter(row_number() == 1) %>%
  ungroup() %>% 
  filter(geneSuperClass=="all_mRNA" & str_detect(gene_id, "ENSG")) %>% # Exclude "CATG___" genes, which appear to be lncRNAs that have been marked as having coding potential
  filter(type=="transcript" & seqid%in%c(glue('chr{1:22}'), glue('chr{c("X","Y","M")}'))) %>% # restrict to reference chromosomes
  dplyr::select(seqid, start, end, strand, gene_id, gene_name, TIEScore) %>%
  arrange(as.character(seqid), start) %>%
  mutate(gene_id = str_replace_all(gene_id, "ENSGR", "ENSG")) %>% # changes 7 gene IDs from pseudoautosomal regions
  mutate(end = case_when(strand=="+" ~ start, TRUE ~ end), # Change start and end coordinates from tx start/end to TSS
         start = case_when(strand=="+" ~ start, TRUE ~ end),
         ensembl_gene_id = str_split_fixed(gene_id,"[.]",2)[,1]) %>%
  dplyr::select(-gene_id)

# What information do we have?
head(FANTOM_TSS)
# How many genes?
nrow(FANTOM_TSS)
## [1] 19002
#[1] 19002

2 Add HGNC and NCBI gene IDs

# Get gene info from biomaRt
if(!file.exists("data/biomaRt_query.rds")){
gene_info <- getBM(attributes = c("ensembl_gene_id", "ensembl_gene_id_version", "percentage_gene_gc_content", "hgnc_id", "hgnc_symbol", "entrezgene_id"), 
      filters = "ensembl_gene_id", 
      values = FANTOM_TSS$ensembl_gene_id, 
      mart = useMart(biomart="ENSEMBL_MART_ENSEMBL", host="sep2019.archive.ensembl.org", path="/biomart/martservice", dataset="hsapiens_gene_ensembl"))
saveRDS(gene_info, "data/biomaRt_query.rds")
} else {
  gene_info <- readRDS("data/biomaRt_query.rds")
}

# Sometimes more than one NCBI gene ID (entrezgene_id) is returned. In these cases, collapse to one row and join values separated by commas
gene_info <- gene_info %>% 
  group_by(ensembl_gene_id) %>%
  summarise(ensembl_gene_id_version = paste(unique(ensembl_gene_id_version), collapse=", "),
            percentage_gene_gc_content = paste(unique(percentage_gene_gc_content), collapse=", "),
            hgnc_id = paste(unique(hgnc_id), collapse=", "),
            hgnc_symbol = paste(unique(hgnc_symbol), collapse=", "),
            entrezgene_id = paste(unique(entrezgene_id), collapse=", ")) %>%
  ungroup()

# Note that some ENSEMBL gene IDs present in hg19 are no longer in the current version of ENSEMBL
dim(gene_info)
## [1] 18610     6
dim(FANTOM_TSS)
## [1] 19002     7
# Join by ensembl_gene_id
FANTOM_TSS <- merge(x = FANTOM_TSS, y = gene_info, by = "ensembl_gene_id", all.x = FALSE) %>%
  arrange(as.character(seqid), start)

# Set column names
colnames(FANTOM_TSS) <- c("ensembl_gene_id", "TSS_chromosome", "TSS_start", "TSS_end", "TSS_strand","FANTOM_gene_name", "TIEScore", "ensembl_gene_id_version", "percentage_gene_gc_content", "HGNC_ID", "HGNC_Symbol", "NCBI_gene_ID")

# clean up
rm(gene_info)

3 DNase peaks near TSSs

Import DNase narrowPeak ranges from the Roadmap Epigenomics project.

# Get annotation hub
ahub <- AnnotationHub()
snapshotDate(ahub)
## [1] "2019-10-29"
# [1] '2019-10-29'

# Create 'promoter' GRanges - 5Kb interval centred on the TSS
gene_promoters <- promoters(makeGRangesFromDataFrame(FANTOM_TSS, keep.extra.columns = TRUE), 
    upstream = 2500, downstream = 2500)

# Extract Roadmap Epigenomics Project DNase narrowPeak records for 111 uniformly
# processed human epigenomes (53 have DNase-seq data available)
npDNase <- query(ahub, c("EpigenomeRoadMap", "Narrow DNasePeaks for consolidated epigenomes", 
    "macs2"))
# How many files returned?
length(npDNase)
## [1] 53
# Take a quick look
head(npDNase)
## AnnotationHub with 6 records
## # snapshotDate(): 2019-10-29 
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: GRanges
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## #   rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH29873"]]' 
## 
##             title                         
##   AH29873 | E003-DNase.macs2.narrowPeak.gz
##   AH29903 | E004-DNase.macs2.narrowPeak.gz
##   AH29924 | E005-DNase.macs2.narrowPeak.gz
##   AH29949 | E006-DNase.macs2.narrowPeak.gz
##   AH29972 | E007-DNase.macs2.narrowPeak.gz
##   AH29995 | E008-DNase.macs2.narrowPeak.gz
# List all of the AnnotationHub identifiers for reproducibility
names(npDNase)
##  [1] "AH29873" "AH29903" "AH29924" "AH29949" "AH29972" "AH29995" "AH30078"
##  [8] "AH30129" "AH30138" "AH30177" "AH30190" "AH30199" "AH30274" "AH30283"
## [15] "AH30308" "AH30317" "AH30326" "AH30340" "AH30460" "AH30469" "AH30477"
## [22] "AH30485" "AH30494" "AH30503" "AH30512" "AH30528" "AH30537" "AH30546"
## [29] "AH30555" "AH30564" "AH30573" "AH30582" "AH30603" "AH30612" "AH30627"
## [36] "AH30688" "AH30720" "AH30743" "AH30755" "AH30767" "AH30779" "AH30791"
## [43] "AH30803" "AH30815" "AH30828" "AH30841" "AH30853" "AH30865" "AH30877"
## [50] "AH30890" "AH31947" "AH31961" "AH31988"
# Retrieve all of the narrowPeak DNase data as GRanges, keeping only the ranges
# that overlap our annotated promoters (promoter_hg19) Define GRanges to hold
# just the narrowPeak regions that overlap promoters
if (!file.exists("data/DNase_narrowPeak_overlap_promoters.rds")) {
    np_promoter <- GRanges()
    for (i in 1:length(npDNase)) {
        # Import narrowPeak file
        np <- npDNase[[i]]
        # Add tissue info
        np$tissue <- mcols(npDNase)$tags[[i]][8]
        # Normalise scores to be between zero and one
        scores <- np$pValue
        scores <- scores - min(scores)
        scores <- scores/max(scores)
        np$norm_score <- scores
        # Restrict to the subset of peaks that overlaps a protein-coding promoter region
        # and add to np_promoter
        np_promoter <- c(np_promoter, np[overlapsAny(np, gene_promoters)])
        # updata
        print(glue("Finished processing file {i} of {length(npDNase)}"))
    }
    saveRDS(np_promoter, "data/DNase_narrowPeak_overlap_promoters.rds")
    rm(i, np_promoter, np)
}

# clean up
rm(ahub, npDNase)

4 Identify single DNase peak position for each gene

Quick visual exploration.

# Import DNase narrowPeak data
np_promoter <- readRDS("data/DNase_narrowPeak_overlap_promoters.rds")

# Define function that accepts a gene ID and creates some simple plots of the promoter and nearby DNase narrowPeak ranges
plot_DNase <- function(gene_name, three_prime_width=2500, five_prime_width=2500){
  # Get the promoter GRanges for the gene of interest
  goi <- gene_promoters[gene_promoters$HGNC_Symbol==gene_name]
  # Find DNase narrowPeaks that overlap the promoter
  overlapping_narrowPeaks <- np_promoter[overlapsAny(np_promoter,goi)]
  # Order GRanges by norm_score
  overlapping_narrowPeaks <- overlapping_narrowPeaks[order(overlapping_narrowPeaks$norm_score, decreasing = TRUE)]
  # Create vector of colour values, based on the narrowPeak$norm_score value
  plot_cols <- colorRampPalette(brewer.pal(11,"Spectral"))(100)[as.numeric(cut(overlapping_narrowPeaks$norm_score,breaks = 100))]
  
  ## Plot the point-source called for each DNase narrowPeak range
  # Create empty plot
  par(mar=c(5.1,25,4.1,2.1)) # bottom, left, top and right margins
  plot("",xlim=c(start(goi)-three_prime_width, end(goi)+five_prime_width), ylim=c(-2,length(overlapping_narrowPeaks)+1), ylab="", yaxt="n", xlab="chromosome position",
       main=gene_name)
  axis(side=2, labels = overlapping_narrowPeaks$tissue, at = 1:length(overlapping_narrowPeaks)+0.5,las=2)
  rect(xleft = start(goi), ybottom = -2, xright = end(goi), ytop = 0, col = "red", border=NA)
  # Plot points for point-source of each narrowPeak
  for(i in 1:length(overlapping_narrowPeaks)){
    points(start(overlapping_narrowPeaks[i]) + overlapping_narrowPeaks[i]$peak, i+0.5, pch=21, col="black", bg=plot_cols[i], cex=1.5)
  }

  # Add a horizontal dashed red line, indicating where the cumulative fraction of norm_score values across all narrowPeak ranges exceeds 0.75 (i.e. the normalised scores of the narrowPeak ranges below this line make up 75% of the total)
  cum_sum_cutoff <- which(cumsum(overlapping_narrowPeaks$norm_score)/sum(overlapping_narrowPeaks$norm_score)>0.75)[1]
  abline(h=cum_sum_cutoff, col="red", lty=2)
  # Restrict the DNase narrowPeaks to those below the line
  overlapping_narrowPeaks <- overlapping_narrowPeaks[1:cum_sum_cutoff]
  # Restrict the narrowPeak ranges to just the point source
  start(overlapping_narrowPeaks) <- start(overlapping_narrowPeaks) + overlapping_narrowPeaks$peak
  end(overlapping_narrowPeaks) <- start(overlapping_narrowPeaks)
  
  # Check whether the point sources are spread across a region >=50bp - otherwise a rolling mean can't be calculated 
  if(diff(range(start(overlapping_narrowPeaks)))>=50){
  # Create a data frame to hold the rolling mean of the norm_score values using a 50bp sliding window
  roll_mean_scores <- data.frame(pos = min(start(overlapping_narrowPeaks)):max(start(overlapping_narrowPeaks)),
                  score = 0)
  # Add the norm-score values
  for(i in 1:length(overlapping_narrowPeaks)){
    position <- roll_mean_scores$pos==start(overlapping_narrowPeaks[i])
    roll_mean_scores$score[position] <- roll_mean_scores$score[position] + overlapping_narrowPeaks[i]$norm_score
  }
  # Calculate the position where the rolling mean of norm_scores is greatest
  roll_mean_peak <- roll_mean_scores$pos[1] + which.max(zoo::rollmean(x=roll_mean_scores$score, k=50))+25
  # Add two blue lines to mark the the 50bp region around the roll_mean_peak
  abline(v=roll_mean_peak-25, col="blue", lty=2);
  abline(v=roll_mean_peak+25, col="blue", lty=2);

  # Restrict the narrowPeaks point sources to those within the selected region (those peaks below the dashed red line and between the two solid blue lines)
  overlapping_narrowPeaks <- overlapping_narrowPeaks[(start(overlapping_narrowPeaks) > roll_mean_peak-25) & (start(overlapping_narrowPeaks) < roll_mean_peak+25)]
  }
  
  # Calculate weighted mean position of the remaining peaks, using the norm_scores as the weights
  centre_pos <-  as.integer(round(weighted.mean(x = start(overlapping_narrowPeaks), w = overlapping_narrowPeaks$norm_score), 0))
  # Mark the position of the DNase peak centre with a thick blue line
  abline(v=centre_pos, col="blue", lwd=2)
  par(mar=c(5.1, 4.1, 4.1, 2.1))
}

plot_DNase("PAX6")

plot_DNase("GAPDH")

plot_DNase("MMP3")

# Clean up
rm(plot_DNase)

Our approach of identifying the strongest DNase peak appears to be working fairly well for now. Proceed to apply the method to all genes, adding the position of the DNase peak to the FANTOM_TSS data frame.

# Define function to identify peaks
identify_DNase <- function(row_num){
  # Get the promoter GRanges for the gene of interest
  goi <- gene_promoters[row_num]
  # Find DNase narrowPeaks that overlap the promoter
  overlapping_narrowPeaks <- np_promoter[overlapsAny(np_promoter,goi)]
  # Order GRanges by norm_score
  overlapping_narrowPeaks <- overlapping_narrowPeaks[order(overlapping_narrowPeaks$norm_score, decreasing = TRUE)]
  
  # Check that we have at least one peak to work with, otherwise there's nothing to be done, in which case return NA
  if(length(overlapping_narrowPeaks)==0){return(NA)}
  
  # identify where the cumulative fraction of norm_score values across all narrowPeak ranges exceeds 0.75 (i.e. the normalised scores of the narrowPeak ranges below this line make up 75% of the total)
  cum_sum_cutoff <- which(cumsum(overlapping_narrowPeaks$norm_score)/sum(overlapping_narrowPeaks$norm_score)>0.75)[1]
  # Restrict the DNase narrowPeaks
  overlapping_narrowPeaks <- overlapping_narrowPeaks[1:cum_sum_cutoff]
  # Restrict the narrowPeak ranges to just the point source
  start(overlapping_narrowPeaks) <- start(overlapping_narrowPeaks) + overlapping_narrowPeaks$peak
  end(overlapping_narrowPeaks) <- start(overlapping_narrowPeaks)
  
  # Check whether the point sources are spread across a region >=50bp - otherwise a rolling mean can't be calculated 
  if(diff(range(start(overlapping_narrowPeaks)))>=50){
  # Create a data frame to hold the rolling mean of the norm_score values using a 50bp sliding window
  roll_mean_scores <- data.frame(pos = min(start(overlapping_narrowPeaks)):max(start(overlapping_narrowPeaks)),
                  score = 0)
  # Add the norm-score values
  for(i in 1:length(overlapping_narrowPeaks)){
    position <- roll_mean_scores$pos==start(overlapping_narrowPeaks[i])
    roll_mean_scores$score[position] <- roll_mean_scores$score[position] + overlapping_narrowPeaks[i]$norm_score
  }
  # Calculate the position where the rolling mean of norm_scores is greatest
  roll_mean_peak <- roll_mean_scores$pos[1] + which.max(zoo::rollmean(x=roll_mean_scores$score, k=50))+25
  
  # Restrict the narrowPeaks point sources to those within 50bp of the peak
  overlapping_narrowPeaks <- overlapping_narrowPeaks[(start(overlapping_narrowPeaks) > roll_mean_peak-25) & (start(overlapping_narrowPeaks) < roll_mean_peak+25)]
  }
  
  # Calculate weighted mean position of the remaining peaks, using the norm_scores as the weights
  centre_pos <-  as.integer(round(weighted.mean(x = start(overlapping_narrowPeaks), w = overlapping_narrowPeaks$norm_score), 0))
  return(centre_pos)
}

# Apply function to identify dominant DNase peak for all genes
if(!file.exists("data/centre_pos.rds")){
  detectCores()
  centre_pos <- unlist(mclapply(1:length(gene_promoters), identify_DNase))
  saveRDS(centre_pos, "data/centre_pos.rds")
} else {
  centre_pos <- readRDS("data/centre_pos.rds")
}

if(!file.exists("data/TSS_DNase.rds")){
  saveRDS(FANTOM_TSS, "data/TSS_DNase.rds")
} else {
  FANTOM_TSS <- readRDS("data/TSS_DNase.rds")
}